{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "from binascii import hexlify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [],
   "source": [
    "def topk(filename=\"bn.npy\", k=100000):\n",
    "    d = np.load(filename)\n",
    "    d = d[d['n-gram'] != b'']\n",
    "    d = pd.DataFrame(d)\n",
    "    d['counter'] = abs(d['counter'])\n",
    "    d.sort_values(by='counter',ascending=False, inplace=True)\n",
    "    top = d.head(k)\n",
    "    if top.duplicated('n-gram').any():\n",
    "        print(\"duplicated found !!!\")\n",
    "    return top"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(s1)=100000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>counter</th>\n",
       "      <th>n-gram</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>63058</th>\n",
       "      <td>93565</td>\n",
       "      <td>b';q=0'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107971</th>\n",
       "      <td>93565</td>\n",
       "      <td>b'q=0.'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73702</th>\n",
       "      <td>74856</td>\n",
       "      <td>b'cept'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77446</th>\n",
       "      <td>74852</td>\n",
       "      <td>b'Acce'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9763</th>\n",
       "      <td>74852</td>\n",
       "      <td>b'ccep'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31992</th>\n",
       "      <td>60320</td>\n",
       "      <td>b'tion'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34395</th>\n",
       "      <td>56165</td>\n",
       "      <td>b'ache'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94747</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'ept-'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44032</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'xml,'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109283</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'text'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55167</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'=0.5'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76346</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'ext/'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78510</th>\n",
       "      <td>56137</td>\n",
       "      <td>b'ost:'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72897</th>\n",
       "      <td>41650</td>\n",
       "      <td>b'lica'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108729</th>\n",
       "      <td>41606</td>\n",
       "      <td>b'atio'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102118</th>\n",
       "      <td>41592</td>\n",
       "      <td>b'plic'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55025</th>\n",
       "      <td>41590</td>\n",
       "      <td>b'icat'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110548</th>\n",
       "      <td>41588</td>\n",
       "      <td>b'cati'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91534</th>\n",
       "      <td>41588</td>\n",
       "      <td>b'appl'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108406</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'ion/'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25423</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'on/x'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82627</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'ppli'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31857</th>\n",
       "      <td>37453</td>\n",
       "      <td>b'host'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101781</th>\n",
       "      <td>37443</td>\n",
       "      <td>b'8080'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26894</th>\n",
       "      <td>37437</td>\n",
       "      <td>b'lhos'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69305</th>\n",
       "      <td>37437</td>\n",
       "      <td>b'alho'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107604</th>\n",
       "      <td>37435</td>\n",
       "      <td>b'loca'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25297</th>\n",
       "      <td>37435</td>\n",
       "      <td>b'ocal'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92957</th>\n",
       "      <td>37434</td>\n",
       "      <td>b'late'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59110</th>\n",
       "      <td>37430</td>\n",
       "      <td>b'cach'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67773</th>\n",
       "      <td>11</td>\n",
       "      <td>b'E10D'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67689</th>\n",
       "      <td>11</td>\n",
       "      <td>b'1459'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70241</th>\n",
       "      <td>11</td>\n",
       "      <td>b'8489'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109815</th>\n",
       "      <td>11</td>\n",
       "      <td>b'271E'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96635</th>\n",
       "      <td>11</td>\n",
       "      <td>b'E9DA'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67786</th>\n",
       "      <td>11</td>\n",
       "      <td>b'66A7'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14515</th>\n",
       "      <td>11</td>\n",
       "      <td>b'55C7'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105930</th>\n",
       "      <td>11</td>\n",
       "      <td>b'5270'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>67768</th>\n",
       "      <td>11</td>\n",
       "      <td>b'5A84'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72580</th>\n",
       "      <td>11</td>\n",
       "      <td>b'D73F'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65236</th>\n",
       "      <td>11</td>\n",
       "      <td>b'C788'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11850</th>\n",
       "      <td>11</td>\n",
       "      <td>b'A2DB'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14430</th>\n",
       "      <td>11</td>\n",
       "      <td>b'BAA0'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105116</th>\n",
       "      <td>11</td>\n",
       "      <td>b'124A'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105095</th>\n",
       "      <td>11</td>\n",
       "      <td>b'A665'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8971</th>\n",
       "      <td>11</td>\n",
       "      <td>b'FBCA'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19730</th>\n",
       "      <td>11</td>\n",
       "      <td>b'31A5'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14410</th>\n",
       "      <td>11</td>\n",
       "      <td>b'9F7E'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51803</th>\n",
       "      <td>11</td>\n",
       "      <td>b'EC93'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101035</th>\n",
       "      <td>11</td>\n",
       "      <td>b'4652'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105092</th>\n",
       "      <td>11</td>\n",
       "      <td>b'C577'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19728</th>\n",
       "      <td>11</td>\n",
       "      <td>b'FA50'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14518</th>\n",
       "      <td>11</td>\n",
       "      <td>b'6B83'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105993</th>\n",
       "      <td>11</td>\n",
       "      <td>b'099B'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11074</th>\n",
       "      <td>11</td>\n",
       "      <td>b'72C5'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22991</th>\n",
       "      <td>11</td>\n",
       "      <td>b'DCB8'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106029</th>\n",
       "      <td>11</td>\n",
       "      <td>b'A781'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63425</th>\n",
       "      <td>11</td>\n",
       "      <td>b'A994'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18500</th>\n",
       "      <td>11</td>\n",
       "      <td>b'0BF4'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109367</th>\n",
       "      <td>11</td>\n",
       "      <td>b'A00E'</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>26125 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        counter   n-gram\n",
       "63058     93565  b';q=0'\n",
       "107971    93565  b'q=0.'\n",
       "73702     74856  b'cept'\n",
       "77446     74852  b'Acce'\n",
       "9763      74852  b'ccep'\n",
       "31992     60320  b'tion'\n",
       "34395     56165  b'ache'\n",
       "94747     56139  b'ept-'\n",
       "44032     56139  b'xml,'\n",
       "109283    56139  b'text'\n",
       "55167     56139  b'=0.5'\n",
       "76346     56139  b'ext/'\n",
       "78510     56137  b'ost:'\n",
       "72897     41650  b'lica'\n",
       "108729    41606  b'atio'\n",
       "102118    41592  b'plic'\n",
       "55025     41590  b'icat'\n",
       "110548    41588  b'cati'\n",
       "91534     41588  b'appl'\n",
       "108406    41586  b'ion/'\n",
       "25423     41586  b'on/x'\n",
       "82627     41586  b'ppli'\n",
       "31857     37453  b'host'\n",
       "101781    37443  b'8080'\n",
       "26894     37437  b'lhos'\n",
       "69305     37437  b'alho'\n",
       "107604    37435  b'loca'\n",
       "25297     37435  b'ocal'\n",
       "92957     37434  b'late'\n",
       "59110     37430  b'cach'\n",
       "...         ...      ...\n",
       "67773        11  b'E10D'\n",
       "67689        11  b'1459'\n",
       "70241        11  b'8489'\n",
       "109815       11  b'271E'\n",
       "96635        11  b'E9DA'\n",
       "67786        11  b'66A7'\n",
       "14515        11  b'55C7'\n",
       "105930       11  b'5270'\n",
       "67768        11  b'5A84'\n",
       "72580        11  b'D73F'\n",
       "65236        11  b'C788'\n",
       "11850        11  b'A2DB'\n",
       "14430        11  b'BAA0'\n",
       "105116       11  b'124A'\n",
       "105095       11  b'A665'\n",
       "8971         11  b'FBCA'\n",
       "19730        11  b'31A5'\n",
       "14410        11  b'9F7E'\n",
       "51803        11  b'EC93'\n",
       "101035       11  b'4652'\n",
       "105092       11  b'C577'\n",
       "19728        11  b'FA50'\n",
       "14518        11  b'6B83'\n",
       "105993       11  b'099B'\n",
       "11074        11  b'72C5'\n",
       "22991        11  b'DCB8'\n",
       "106029       11  b'A781'\n",
       "63425        11  b'A994'\n",
       "18500        11  b'0BF4'\n",
       "109367       11  b'A00E'\n",
       "\n",
       "[26125 rows x 2 columns]"
      ]
     },
     "execution_count": 215,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topk1 = topk(\"bn.1m.npy\", 100000)  # topk_of_1m\n",
    "# Distinct n-gram values among the rows returned for bn.1m.npy.\n",
    "s1 = set(topk1['n-gram'])\n",
    "print(\"len(s1)=%ld\" % (len(s1),))\n",
    "# Show only the rows whose counter exceeds 10.\n",
    "topk1[topk1['counter'].gt(10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(s2)=10000\n",
      "len(s2.intersection(s1))=9624\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>counter</th>\n",
       "      <th>n-gram</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8900</th>\n",
       "      <td>93565</td>\n",
       "      <td>b'q=0.'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8657</th>\n",
       "      <td>93565</td>\n",
       "      <td>b';q=0'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11474</th>\n",
       "      <td>74856</td>\n",
       "      <td>b'cept'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5501</th>\n",
       "      <td>74852</td>\n",
       "      <td>b'Acce'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4476</th>\n",
       "      <td>74852</td>\n",
       "      <td>b'ccep'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2849</th>\n",
       "      <td>60320</td>\n",
       "      <td>b'tion'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10750</th>\n",
       "      <td>56165</td>\n",
       "      <td>b'ache'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6633</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'ept-'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4399</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'=0.5'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4795</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'ext/'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10115</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'xml,'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7927</th>\n",
       "      <td>56139</td>\n",
       "      <td>b'text'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10200</th>\n",
       "      <td>56137</td>\n",
       "      <td>b'ost:'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1008</th>\n",
       "      <td>41650</td>\n",
       "      <td>b'lica'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11236</th>\n",
       "      <td>41606</td>\n",
       "      <td>b'atio'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>624</th>\n",
       "      <td>41592</td>\n",
       "      <td>b'plic'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7062</th>\n",
       "      <td>41590</td>\n",
       "      <td>b'icat'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8497</th>\n",
       "      <td>41588</td>\n",
       "      <td>b'appl'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2496</th>\n",
       "      <td>41588</td>\n",
       "      <td>b'cati'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>548</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'ion/'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9397</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'on/x'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3910</th>\n",
       "      <td>41586</td>\n",
       "      <td>b'ppli'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5678</th>\n",
       "      <td>37453</td>\n",
       "      <td>b'host'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1811</th>\n",
       "      <td>37443</td>\n",
       "      <td>b'8080'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11466</th>\n",
       "      <td>37437</td>\n",
       "      <td>b'alho'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5312</th>\n",
       "      <td>37437</td>\n",
       "      <td>b'lhos'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>399</th>\n",
       "      <td>37435</td>\n",
       "      <td>b'ocal'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6020</th>\n",
       "      <td>37435</td>\n",
       "      <td>b'loca'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9354</th>\n",
       "      <td>37434</td>\n",
       "      <td>b'late'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8334</th>\n",
       "      <td>37430</td>\n",
       "      <td>b'cach'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6217</th>\n",
       "      <td>88</td>\n",
       "      <td>b'D5\\r\\n'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6162</th>\n",
       "      <td>88</td>\n",
       "      <td>b'ranz'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6246</th>\n",
       "      <td>88</td>\n",
       "      <td>b'44AC'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6267</th>\n",
       "      <td>88</td>\n",
       "      <td>b'26AB'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6265</th>\n",
       "      <td>88</td>\n",
       "      <td>b'gs%4'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6264</th>\n",
       "      <td>88</td>\n",
       "      <td>b'=Ch9'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6262</th>\n",
       "      <td>88</td>\n",
       "      <td>b'D4C3'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6261</th>\n",
       "      <td>88</td>\n",
       "      <td>b'un&amp;p'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6259</th>\n",
       "      <td>88</td>\n",
       "      <td>b'AF91'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6257</th>\n",
       "      <td>88</td>\n",
       "      <td>b'24FE'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6255</th>\n",
       "      <td>88</td>\n",
       "      <td>b'5E37'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6253</th>\n",
       "      <td>88</td>\n",
       "      <td>b'7ECA'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6252</th>\n",
       "      <td>88</td>\n",
       "      <td>b'n=Po'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6251</th>\n",
       "      <td>88</td>\n",
       "      <td>b'Pale'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6250</th>\n",
       "      <td>88</td>\n",
       "      <td>b'5EB2'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6248</th>\n",
       "      <td>88</td>\n",
       "      <td>b'=E2E'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6247</th>\n",
       "      <td>88</td>\n",
       "      <td>b'5495'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6244</th>\n",
       "      <td>88</td>\n",
       "      <td>b'5954'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6218</th>\n",
       "      <td>88</td>\n",
       "      <td>b'578D'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6243</th>\n",
       "      <td>88</td>\n",
       "      <td>b'24E0'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6242</th>\n",
       "      <td>88</td>\n",
       "      <td>b'AB5B'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6241</th>\n",
       "      <td>88</td>\n",
       "      <td>b'23B0'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6239</th>\n",
       "      <td>88</td>\n",
       "      <td>b'D4EC'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6238</th>\n",
       "      <td>88</td>\n",
       "      <td>b'00DA'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6237</th>\n",
       "      <td>88</td>\n",
       "      <td>b'57C3'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6236</th>\n",
       "      <td>88</td>\n",
       "      <td>b'DAD1'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6235</th>\n",
       "      <td>88</td>\n",
       "      <td>b'A697'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6234</th>\n",
       "      <td>88</td>\n",
       "      <td>b'ullu'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6231</th>\n",
       "      <td>88</td>\n",
       "      <td>b'9213'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6229</th>\n",
       "      <td>88</td>\n",
       "      <td>b'a.+M'</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       counter     n-gram\n",
       "8900     93565    b'q=0.'\n",
       "8657     93565    b';q=0'\n",
       "11474    74856    b'cept'\n",
       "5501     74852    b'Acce'\n",
       "4476     74852    b'ccep'\n",
       "2849     60320    b'tion'\n",
       "10750    56165    b'ache'\n",
       "6633     56139    b'ept-'\n",
       "4399     56139    b'=0.5'\n",
       "4795     56139    b'ext/'\n",
       "10115    56139    b'xml,'\n",
       "7927     56139    b'text'\n",
       "10200    56137    b'ost:'\n",
       "1008     41650    b'lica'\n",
       "11236    41606    b'atio'\n",
       "624      41592    b'plic'\n",
       "7062     41590    b'icat'\n",
       "8497     41588    b'appl'\n",
       "2496     41588    b'cati'\n",
       "548      41586    b'ion/'\n",
       "9397     41586    b'on/x'\n",
       "3910     41586    b'ppli'\n",
       "5678     37453    b'host'\n",
       "1811     37443    b'8080'\n",
       "11466    37437    b'alho'\n",
       "5312     37437    b'lhos'\n",
       "399      37435    b'ocal'\n",
       "6020     37435    b'loca'\n",
       "9354     37434    b'late'\n",
       "8334     37430    b'cach'\n",
       "...        ...        ...\n",
       "6217        88  b'D5\\r\\n'\n",
       "6162        88    b'ranz'\n",
       "6246        88    b'44AC'\n",
       "6267        88    b'26AB'\n",
       "6265        88    b'gs%4'\n",
       "6264        88    b'=Ch9'\n",
       "6262        88    b'D4C3'\n",
       "6261        88    b'un&p'\n",
       "6259        88    b'AF91'\n",
       "6257        88    b'24FE'\n",
       "6255        88    b'5E37'\n",
       "6253        88    b'7ECA'\n",
       "6252        88    b'n=Po'\n",
       "6251        88    b'Pale'\n",
       "6250        88    b'5EB2'\n",
       "6248        88    b'=E2E'\n",
       "6247        88    b'5495'\n",
       "6244        88    b'5954'\n",
       "6218        88    b'578D'\n",
       "6243        88    b'24E0'\n",
       "6242        88    b'AB5B'\n",
       "6241        88    b'23B0'\n",
       "6239        88    b'D4EC'\n",
       "6238        88    b'00DA'\n",
       "6237        88    b'57C3'\n",
       "6236        88    b'DAD1'\n",
       "6235        88    b'A697'\n",
       "6234        88    b'ullu'\n",
       "6231        88    b'9213'\n",
       "6229        88    b'a.+M'\n",
       "\n",
       "[10000 rows x 2 columns]"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topk2 = topk(\"bn.newest.npy\", 10000)  # topk_of_10k\n",
    "# Distinct n-gram values among the rows returned for bn.newest.npy.\n",
    "s2 = set(topk2['n-gram'])\n",
    "print(\"len(s2)=%ld\" % (len(s2),))\n",
    "# Number of n-grams shared with the s1 set built from bn.1m.npy.\n",
    "print(\"len(s2.intersection(s1))=%ld\" % (len(s2 & s1),))\n",
    "# Show only the rows whose counter exceeds 10.\n",
    "topk2[topk2['counter'].gt(10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9783"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Size of the overlap between the two n-gram sets.\n",
    "len(s1 & s2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8         b'340&'\n",
       "13        b'9973'\n",
       "41        b'o+S%'\n",
       "51        b'asia'\n",
       "54        b'n%E9'\n",
       "57        b'6824'\n",
       "61        b'a+Se'\n",
       "87        b're=O'\n",
       "113       b'jpg '\n",
       "123       b'=361'\n",
       "141       b'0427'\n",
       "150       b'FF&c'\n",
       "158       b'rlos'\n",
       "159       b'264F'\n",
       "166       b'AF35'\n",
       "176       b'ras&'\n",
       "182       b'6025'\n",
       "186       b'6924'\n",
       "196       b'7606'\n",
       "199       b'C17E'\n",
       "234       b'112&'\n",
       "237       b'ana+'\n",
       "255       b'B893'\n",
       "285       b'3244'\n",
       "289       b'as.j'\n",
       "291       b'51&B'\n",
       "302       b'140&'\n",
       "315       b'8795'\n",
       "322       b'-gzi'\n",
       "324       b'el&a'\n",
       "           ...   \n",
       "121438    b'4921'\n",
       "121450    b'5, *'\n",
       "121490    b'hero'\n",
       "121507    b'tc=8'\n",
       "121508    b'7479'\n",
       "121513    b'adri'\n",
       "121514    b'3%2C'\n",
       "121522    b'7386'\n",
       "121526    b'C+S%'\n",
       "121549    b'n=Pa'\n",
       "121550    b'2814'\n",
       "121554    b'cos&'\n",
       "121597    b'c=42'\n",
       "121604    b'%28I'\n",
       "121609    b'so+M'\n",
       "121623    b'439F'\n",
       "121626    b'1084'\n",
       "121637    b'4928'\n",
       "121649    b'lla&'\n",
       "121658    b'40vi'\n",
       "121663    b'5CA7'\n",
       "121665    b'7963'\n",
       "121688    b'8094'\n",
       "121698    b'inco'\n",
       "121739    b'dir+'\n",
       "121769    b'6038'\n",
       "121789    b'an%E'\n",
       "121810    b'8924'\n",
       "121811    b'0643'\n",
       "121823    b'22FA'\n",
       "Name: n-gram, Length: 9184, dtype: object"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# n-grams in the bn.1m.npy top-k that are absent from the bn.newest.npy top-k.\n",
    "# s1 and s2 are plain Python sets (no .loc/.index), so membership is tested\n",
    "# by n-gram value on the topk1 DataFrame instead of by row label.\n",
    "topk1.loc[~topk1['n-gram'].isin(s2), 'n-gram']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34395     b'ache'\n",
       "55167     b'=0.5'\n",
       "55025     b'icat'\n",
       "82627     b'ppli'\n",
       "59110     b'cach'\n",
       "45972     b'html'\n",
       "82094     b'gzip'\n",
       "22896     b'imag'\n",
       "53903     b'quer'\n",
       "9325      b'ible'\n",
       "45481     b': te'\n",
       "98538     b'da1/'\n",
       "24046     b'-Enc'\n",
       "30483     b'e/pn'\n",
       "89268     b'3.5.'\n",
       "89156     b'ol: '\n",
       "50643     b'ble;'\n",
       "62265     b': x-'\n",
       "78821    b'1.1\\r'\n",
       "33545     b't-La'\n",
       "47457     b'.8,i'\n",
       "42938     b', ut'\n",
       "22836     b'p://'\n",
       "59922     b'/xht'\n",
       "38529     b'0 (c'\n",
       "61628     b'TML/'\n",
       "26301     b'xt/p'\n",
       "63411     b'ain;'\n",
       "97213     b'Conn'\n",
       "96752     b'ion:'\n",
       "           ...   \n",
       "53347     b'5620'\n",
       "81859     b'2245'\n",
       "43335     b'8808'\n",
       "32303     b'6225'\n",
       "41542     b'DF83'\n",
       "5091      b'B7AC'\n",
       "5308      b'C2AE'\n",
       "96883     b'2781'\n",
       "72091     b'5A3A'\n",
       "63800     b'E7AA'\n",
       "77642     b'77D1'\n",
       "22984     b'7741'\n",
       "53223     b'9138'\n",
       "61270     b'0943'\n",
       "93034     b'E693'\n",
       "81831     b'B4B4'\n",
       "36148     b'3769'\n",
       "36129     b'8272'\n",
       "7056      b'D156'\n",
       "79856     b'6580'\n",
       "79934     b'0494'\n",
       "97065     b'1386'\n",
       "90948     b'6637'\n",
       "93393     b'DA00'\n",
       "57734     b'5587'\n",
       "36256     b'9209'\n",
       "80127     b'D721'\n",
       "92619     b'ACF1'\n",
       "28244     b'8D4E'\n",
       "95739     b'9373'\n",
       "Name: n-gram, Length: 816, dtype: object"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# n-grams in the bn.1m.npy top-k that also appear in the bn.newest.npy top-k.\n",
    "# s1 and s2 are plain Python sets (no .loc/.index), so membership is tested\n",
    "# by n-gram value on the topk1 DataFrame instead of by row label.\n",
    "topk1.loc[topk1['n-gram'].isin(s2), 'n-gram']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2         b'ruel'\n",
       "12        b'i=81'\n",
       "32        b'25F2'\n",
       "34        b'i=60'\n",
       "45        b'0343'\n",
       "77        b't: l'\n",
       "80        b'5313'\n",
       "82        b'r%E9'\n",
       "92        b'0345'\n",
       "114       b'9491'\n",
       "115       b'e=Br'\n",
       "122       b'D=F2'\n",
       "133       b'D=93'\n",
       "138       b'9973'\n",
       "139       b'edon'\n",
       "148       b'4547'\n",
       "151       b'N+&c'\n",
       "162       b'0161'\n",
       "164       b'AD30'\n",
       "189       b'D=13'\n",
       "198       b'eo&n'\n",
       "210       b'o+Vi'\n",
       "211       b'n=pe'\n",
       "217       b'ara%'\n",
       "220       b'911E'\n",
       "221       b'9185'\n",
       "231       b'+Esp'\n",
       "250       b'e=Si'\n",
       "271       b'40th'\n",
       "274       b'SSIO'\n",
       "           ...   \n",
       "98655     b'ltra'\n",
       "98677     b'1B22'\n",
       "98692     b'=376'\n",
       "98694     b'66&B'\n",
       "98698     b'Manu'\n",
       "98703     b'=177'\n",
       "98706     b'7438'\n",
       "98733     b'F32B'\n",
       "98760     b'F97D'\n",
       "98766     b'd&dn'\n",
       "98798     b'1721'\n",
       "98806     b'=371'\n",
       "98819     b'D270'\n",
       "98825     b'7612'\n",
       "98834     b'isti'\n",
       "98850     b'Mozi'\n",
       "98855     b'8924'\n",
       "98860     b'0184'\n",
       "98861     b'xt/x'\n",
       "98875     b'8CE3'\n",
       "98881     b'Pe%F'\n",
       "98889     b'7258'\n",
       "98890     b'B572'\n",
       "98929     b'c=36'\n",
       "98938    b'248\\r'\n",
       "98955     b'22FA'\n",
       "98956     b'defl'\n",
       "98967     b'805D'\n",
       "98972     b'=kar'\n",
       "98978     b'trar'\n",
       "Name: n-gram, Length: 9184, dtype: object"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# n-grams in the bn.newest.npy top-k that are absent from the bn.1m.npy top-k.\n",
    "# s1 and s2 are plain Python sets (no .loc/.index), so membership is tested\n",
    "# by n-gram value on the topk2 DataFrame instead of by row label.\n",
    "topk2.loc[~topk2['n-gram'].isin(s1), 'n-gram']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24539    b'ext/'\n",
       "90304    b'atio'\n",
       "61473    b'l,ap'\n",
       "24046    b',app'\n",
       "25925    b't:80'\n",
       "30483    b'nque'\n",
       "12429    b'agma'\n",
       "41646    b'SION'\n",
       "82892    b'ge: '\n",
       "59784    b'er-A'\n",
       "52197    b'gma:'\n",
       "52735    b', ut'\n",
       "73181    b'ml;q'\n",
       "53527    b'or/3'\n",
       "22166    b'ML/3'\n",
       "36687    b'pt-L'\n",
       "36555    b'g,*/'\n",
       "38706    b'ge/p'\n",
       "63619    b'l+xm'\n",
       "38272    b'HTTP'\n",
       "93810    b'TP/1'\n",
       "95136    b'=0.8'\n",
       "31435    b'-8;q'\n",
       "33223    b'nux)'\n",
       "96451    b'0.5,'\n",
       "86584    b'ept:'\n",
       "90893    b'ion:'\n",
       "70016    b'on: '\n",
       "95570    b'blic'\n",
       "79027    b'gist'\n",
       "          ...   \n",
       "6189     b'2198'\n",
       "81211    b'6612'\n",
       "96977    b'776E'\n",
       "6724     b'rd=G'\n",
       "79414    b'00F1'\n",
       "82794    b'38EC'\n",
       "45481    b'D05C'\n",
       "45203    b'BC99'\n",
       "22096    b'8114'\n",
       "88387    b'0F70'\n",
       "96751    b'1761'\n",
       "89504    b'7347'\n",
       "91454    b'8D4E'\n",
       "22984    b'3C56'\n",
       "8552     b'79FA'\n",
       "42067    b'1511'\n",
       "42076    b'+Pen'\n",
       "8399     b'6131'\n",
       "41102    b'7915'\n",
       "91389    b'74E8'\n",
       "2549     b'BFF8'\n",
       "86478    b'AD16'\n",
       "42240    b'5835'\n",
       "15427    b'71E6'\n",
       "22896    b'7E5C'\n",
       "56124    b'4189'\n",
       "86215    b'C5AC'\n",
       "40384    b'245D'\n",
       "74343    b'0985'\n",
       "409      b'ries'\n",
       "Name: n-gram, Length: 816, dtype: object"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# n-grams in the bn.newest.npy top-k that also appear in the bn.1m.npy top-k.\n",
    "# s1 and s2 are plain Python sets (no .loc/.index), so membership is tested\n",
    "# by n-gram value on the topk2 DataFrame instead of by row label.\n",
    "topk2.loc[topk2['n-gram'].isin(s1), 'n-gram']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
